Load required libraries.

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(grid)
library(knitr)
library(wordbankr)
library(langcog)
theme_set(theme_mikabr())

Load in Wordbank data.

items <- get_item_data() %>%
  filter(type == "word") %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))))

Get vocabulary composition data for all languages.

get_vocab_comp <- function(input_language, input_form) {
  
  lang_vocab_items <- filter(items, language == input_language, form == input_form) %>%
    filter(lexical_category %in% c("nouns", "predicates", "function_words"))
  
  lang_vocab_data <- get_instrument_data(instrument_language = input_language,
                                         instrument_form = input_form,
                                         items = lang_vocab_items$item_id,
                                         iteminfo = lang_vocab_items) %>%
    mutate(value = ifelse(is.na(value), "", value),
           produces = value == "produces",
           understands = value == "produces" | value == "understands") %>%
    select(-value) %>%
    gather(measure, value, produces, understands)
  
  num_words <- nrow(lang_vocab_items)
  
  lang_vocab_summary <- lang_vocab_data %>%
    group_by(data_id, measure, lexical_category) %>%
    summarise(num_true = sum(value),
              prop = sum(value) / n())
  
  lang_vocab_sizes <- lang_vocab_summary %>%
    summarise(vocab = sum(num_true) / num_words)
  
  lang_vocab_summary %>%
    left_join(lang_vocab_sizes) %>%
    select(-num_true) %>%
    mutate(language = input_language, form = input_form)
  
}
instruments <- items %>%
  select(language, form) %>%
  distinct()

vocab_comp_data <- map2(instruments$language, instruments$form, get_vocab_comp) %>%
  bind_rows()

Show sample size of each instrument.

sample_sizes <- vocab_comp_data %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  select(language, form, n) %>%
  distinct()
kable(sample_sizes)
language form n
British Sign Language WG 161
Cantonese WS 987
Croatian WG 250
Croatian WS 377
Danish WS 3714
English WG 2454
English WS 5824
German WS 1183
Hebrew WG 62
Hebrew WS 253
Italian WG 648
Italian WS 752
Mandarin TC 652
Mandarin WS 1056
Norwegian WG 3025
Norwegian WS 12969
Russian WG 768
Russian WS 1037
Spanish WG 778
Spanish WS 1094
Swedish WG 474
Swedish WS 900
Turkish WG 1115
Turkish WS 2422

Base plot for looking at vocabulary composition.

base_plot <- function(input_form, input_measure) {
  vocab_comp_data %>%
    filter(form == input_form, measure == input_measure, language != "Hebrew") %>%
    mutate(lexical_category = factor(lexical_category,
                                     levels = c("nouns", "predicates", "function_words"),
                                     labels = c("Nouns", "Predicates", "Function Words"))) %>%
    ggplot(aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~language, ncol = 3) +
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "\nVocabulary Size") +
    scale_color_solarized(name = "") + # "Lexical Category") +
    theme(legend.position = c(0.068, 0.95),
          #legend.text = element_text(size = 9),
          legend.title = element_text(lineheight = unit(0.8, "char")), #size = 9),
          legend.key.height = unit(0.8, "char"),
          #legend.key.width = unit(0.3, "cm"),
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"))
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WS", "produces") + geom_jitter(size = 0.7)

Plot WG productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WG", "produces") + geom_jitter(size = 0.7)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language.

base_plot("WG", "understands") + geom_jitter(size = 0.7)

Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.

base_plot("WS", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.

base_plot("WG", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.

base_plot("WG", "understands") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

base_plot("WS", "produces") +
  geom_jitter(size = 0.7, alpha = 0.5) +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1, size = 1) +
  theme_mikabr(base_size = 20) +
  theme(legend.position = "top")

#ggsave("BUCLD/data_models.png", width = 15, height = 17, dpi = 300)
num_admins <- vocab_comp_data %>%
  filter(form == "WS") %>%
  distinct(data_id) %>%
  group_by(language) %>%
  summarise(N = n())

num_items <- items %>%
  filter(form == "WS", lexical_category %in% c("nouns", "predicates", "function_words")) %>%
  group_by(language, lexical_category) %>%
  summarise(n = n()) %>%
  spread(lexical_category, n)

sample_sizes <- num_admins %>%
  left_join(num_items) %>%
  select(language, N, nouns, predicates, function_words) %>%
  rename(Language = language, Nouns = nouns, Predicates = predicates, `Function Words` = function_words)
kable(sample_sizes)
Language N Nouns Predicates Function Words
Cantonese 987 316 256 108
Croatian 377 312 166 139
Danish 3714 316 166 128
English 5824 312 166 102
German 1183 270 154 93
Hebrew 253 322 151 49
Italian 752 312 166 92
Mandarin 1056 322 260 113
Norwegian 12969 316 170 132
Russian 1037 314 182 102
Spanish 1094 312 166 102
Swedish 900 339 167 97
Turkish 2422 297 207 84
#png("BUCLD/sample_sizes.png", width = 1080/72, height = 540/72, res = 300, units = "in")
#textplot(sample_sizes, show.rownames = FALSE, mar = c(0, 0, 0, 0), cmar = 4)
#dev.off()

Function for resampling data.

sample_areas <- function(d, nboot = 1000) {
  
  poly_area <- function(group_data) {
    model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1,
                data = group_data)
    return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1])
  }
  
  counter <- 1
  sample_area <- function(d) {
    d_frame <- d %>%
      group_by(language, form, measure) %>%
      sample_frac(replace = TRUE) %>%
      group_by(language, form, measure, lexical_category) %>%
      do(area = poly_area(.)) %>%
      mutate(area = area[1]) %>%
      rename_(.dots = setNames("area", counter))
    
    counter <<- counter + 1 # increment counter outside scope
    return(d_frame)
  }
  
  areas <- replicate(nboot, sample_area(d), simplify = FALSE)
  
  Reduce(left_join, areas) %>%
    gather(sample, area, -language, -form, -measure, -lexical_category)
}

Resample data, compute area for each sample, find the mean and CI of the area estimate.

areas <- sample_areas(vocab_comp_data, 1000)

area_summary <- areas %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(mean =  mean(area),
            ci_lower = ci_lower(area),
            ci_upper = ci_upper(area)) %>%
  ungroup() %>%
  mutate(language = factor(language),
         instrument = paste(language, form))

area_order <- filter(area_summary, form == "WS", measure == "produces",
                     lexical_category == "nouns")
language_levels <- area_order$language[order(area_order$mean,
                                             area_order$language,
                                             decreasing = FALSE)]

area_summary_ordered <- area_summary %>%
  filter(form %in% c("WS", "WG"),
         !(form == "WS" & measure == "understands")) %>%
  ungroup() %>%
  mutate(language = factor(language, levels = language_levels),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))

Plot each lexical category’s area estimate by language, form, and measure.

ggplot(area_summary_ordered,
       aes(y = language, x = mean, colour = lexical_category)) +
  facet_grid(lexical_category ~ form + measure) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary")

Plot each lexical category’s area estimate by language and measure for WS only.

ggplot(filter(area_summary_ordered, form == "WS"),
       aes(y = language, x = mean, col = lexical_category)) +
  facet_grid(. ~ lexical_category) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary") +
  theme_mikabr(base_size = 20)

#ggsave("BUCLD/diffs.png", width = 1080/72, height = 540/72)

Demo plots of English and Mandarin data and models.

demo_langs <- c("English", "Mandarin")
demo_data <- filter(vocab_comp_data, form == "WS", language %in% demo_langs) %>%
  mutate(panel = paste(language, "(data)"),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))
pts <- seq(0, 1, 0.01)

models <- demo_data %>%
  group_by(language, lexical_category) %>%
  do(model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1, data = .))

get_lang_lexcat_predictions <- function(lang, lexcat) {
  model <- filter(models, language == lang, lexical_category == lexcat)$model[[1]]
  data.frame(vocab = pts,
             prop = predict(model, newdata = data.frame(vocab = pts)),
             lexical_category = lexcat,
             language = lang)
}

get_lang_predictions <- function(lang) {
  bind_rows(sapply(unique(demo_data$lexical_category),
                   function(lexcat) get_lang_lexcat_predictions(lang, lexcat),
                   simplify = FALSE))
}

predictions <- bind_rows(sapply(demo_langs, get_lang_predictions, simplify = FALSE))

diagonal <- expand.grid(vocab = rep(rev(pts)),
                        language = demo_langs,
                        lexical_category = unique(demo_data$lexical_category))
diagonal$prop <- diagonal$vocab

area_poly <- bind_rows(predictions, diagonal) %>%
  mutate(panel = paste(language, "(models)"))
ggplot(demo_data,
       aes(x = vocab, y = prop, colour = lexical_category, fill = lexical_category)) +
  facet_grid(~ panel) +
  geom_point(size = 0.7) +
  geom_polygon(data = area_poly, alpha = 0.2) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "\nVocabulary Size") +
  scale_color_solarized(guide = FALSE) +
  scale_fill_solarized(name = "") +
  theme(legend.position = c(0.061, 0.91),
        legend.text = element_text(size = 8),
        legend.key.height = unit(0.9, "char"),
        legend.key.width = unit(0.88, "char"),
        legend.background = element_rect(fill = "transparent"))